R Markdown
# Install and load the required package
library(gginference)
## Warning: package 'gginference' was built under R version 4.3.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.2 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(BSDA)
## Warning: package 'BSDA' was built under R version 4.3.2
## Loading required package: lattice
##
## Attaching package: 'BSDA'
##
## The following object is masked from 'package:datasets':
##
## Orange
data <- read.csv("Obesity.csv")
data <- rename(data,Frequent_consumption_of_high_caloric_food = FAVC,Frequency_of_consumption_of_vegetables = FCVC, Number_of_main_meals = NCP, Consumption_of_food_between_meals = CAEC, Consumption_of_water_daily = CH2O, Consumption_of_alcohol = CALC,Calories_consumption_monitoring = SCC, Physical_activity_frequency = FAF, Time_using_technology_devices = TUE, Transportation_used = MTRANS)
head(data)
## Gender Age Height Weight family_history_with_overweight
## 1 Female 21 1.62 64.0 yes
## 2 Female 21 1.52 56.0 yes
## 3 Male 23 1.80 77.0 yes
## 4 Male 27 1.80 87.0 no
## 5 Male 22 1.78 89.8 no
## 6 Male 29 1.62 53.0 no
## Frequent_consumption_of_high_caloric_food
## 1 no
## 2 no
## 3 no
## 4 no
## 5 no
## 6 yes
## Frequency_of_consumption_of_vegetables Number_of_main_meals
## 1 2 3
## 2 3 3
## 3 2 3
## 4 3 3
## 5 2 1
## 6 2 3
## Consumption_of_food_between_meals SMOKE Consumption_of_water_daily
## 1 Sometimes no 2
## 2 Sometimes yes 3
## 3 Sometimes no 2
## 4 Sometimes no 2
## 5 Sometimes no 2
## 6 Sometimes no 2
## Calories_consumption_monitoring Physical_activity_frequency
## 1 no 0
## 2 yes 3
## 3 no 2
## 4 no 2
## 5 no 0
## 6 no 0
## Time_using_technology_devices Consumption_of_alcohol Transportation_used
## 1 1 no Public_Transportation
## 2 0 Sometimes Public_Transportation
## 3 1 Frequently Public_Transportation
## 4 0 Frequently Walking
## 5 0 Sometimes Public_Transportation
## 6 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
data$obesity_categories <- ifelse(data$NObeyesdad %in% c("Insufficient_Weight", "Normal_Weight"), "Normal", (ifelse(data$NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"), "Obese", "Overweight")))
head(data)
## Gender Age Height Weight family_history_with_overweight
## 1 Female 21 1.62 64.0 yes
## 2 Female 21 1.52 56.0 yes
## 3 Male 23 1.80 77.0 yes
## 4 Male 27 1.80 87.0 no
## 5 Male 22 1.78 89.8 no
## 6 Male 29 1.62 53.0 no
## Frequent_consumption_of_high_caloric_food
## 1 no
## 2 no
## 3 no
## 4 no
## 5 no
## 6 yes
## Frequency_of_consumption_of_vegetables Number_of_main_meals
## 1 2 3
## 2 3 3
## 3 2 3
## 4 3 3
## 5 2 1
## 6 2 3
## Consumption_of_food_between_meals SMOKE Consumption_of_water_daily
## 1 Sometimes no 2
## 2 Sometimes yes 3
## 3 Sometimes no 2
## 4 Sometimes no 2
## 5 Sometimes no 2
## 6 Sometimes no 2
## Calories_consumption_monitoring Physical_activity_frequency
## 1 no 0
## 2 yes 3
## 3 no 2
## 4 no 2
## 5 no 0
## 6 no 0
## Time_using_technology_devices Consumption_of_alcohol Transportation_used
## 1 1 no Public_Transportation
## 2 0 Sometimes Public_Transportation
## 3 1 Frequently Public_Transportation
## 4 0 Frequently Walking
## 5 0 Sometimes Public_Transportation
## 6 0 Sometimes Automobile
## NObeyesdad obesity_categories
## 1 Normal_Weight Normal
## 2 Normal_Weight Normal
## 3 Normal_Weight Normal
## 4 Overweight_Level_I Overweight
## 5 Overweight_Level_II Overweight
## 6 Normal_Weight Normal
ggplot(data, aes(x = Weight, fill = NObeyesdad)) +
geom_density(alpha = 0.5) +
labs(title = "Density Plot of Age with Obesity Type",
x = "Age",
y = "Density") +
theme_minimal() +
facet_wrap(~NObeyesdad, scales = "free")

histogram <- plot_ly(data, x = ~obesity_categories, type = "histogram",color = ~family_history_with_overweight, facet_col = ~family_history_with_overweight,colors = c("steelblue","coral")) %>%
layout(title = "Histogram of Obesity Type",
xaxis = list(title = "Obesity Type"),
yaxis = list(title = "Number of people"),barmode = "stack")
# Display the histogram
histogram
## Warning: 'histogram' objects don't have these attributes: 'facet_col'
## Valid attributes include:
## '_deprecated', 'alignmentgroup', 'autobinx', 'autobiny', 'bingroup', 'cliponaxis', 'constraintext', 'cumulative', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'histfunc', 'histnorm', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'insidetextanchor', 'insidetextfont', 'legendgroup', 'legendgrouptitle', 'legendrank', 'marker', 'meta', 'metasrc', 'name', 'nbinsx', 'nbinsy', 'offsetgroup', 'opacity', 'orientation', 'outsidetextfont', 'selected', 'selectedpoints', 'showlegend', 'stream', 'text', 'textangle', 'textfont', 'textposition', 'textsrc', 'texttemplate', 'transforms', 'type', 'uid', 'uirevision', 'unselected', 'visible', 'x', 'xaxis', 'xbins', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'yaxis', 'ybins', 'ycalendar', 'yhoverformat', 'ysrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'histogram' objects don't have these attributes: 'facet_col'
## Valid attributes include:
## '_deprecated', 'alignmentgroup', 'autobinx', 'autobiny', 'bingroup', 'cliponaxis', 'constraintext', 'cumulative', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'histfunc', 'histnorm', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'insidetextanchor', 'insidetextfont', 'legendgroup', 'legendgrouptitle', 'legendrank', 'marker', 'meta', 'metasrc', 'name', 'nbinsx', 'nbinsy', 'offsetgroup', 'opacity', 'orientation', 'outsidetextfont', 'selected', 'selectedpoints', 'showlegend', 'stream', 'text', 'textangle', 'textfont', 'textposition', 'textsrc', 'texttemplate', 'transforms', 'type', 'uid', 'uirevision', 'unselected', 'visible', 'x', 'xaxis', 'xbins', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'yaxis', 'ybins', 'ycalendar', 'yhoverformat', 'ysrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
box_plot <- plot_ly(data, y = ~Height, color = ~Gender, type = "violin", box = list(visible = TRUE),colors = c("darkviolet","darkgreen")) %>%
layout(title = "Box Plot of Height Gender-wise",
yaxis = list(title = "Height"),
xaxis = list(title = "Gender"))
# Display the box plot
box_plot
age_breaks <- c(0,25, 34,61)
age_labels <- c("youth (<25)","Young Adults (25-34)","Adults (>=35)")
data$AgeGroup <- cut(data$Age, breaks = age_breaks, labels = age_labels)
# avg_data <- aggregate(Time_using_technology_devices ~ AgeGroup, data = data, FUN = mean)
line_chart <- plot_ly(
type = "box",
x = data$AgeGroup,
y = data$Time_using_technology_devices,
)
line_chart <- line_chart %>% layout(title = "Boxplot of time using technology devices among various age groups",yaxis = list(title ="Age Groups"),yaxis = list(title ="Time_using_technology_devices"))
line_chart
heatmap_ggplot <- ggplot(data, aes(x = AgeGroup, y = Transportation_used, fill = Physical_activity_frequency)) +
geom_tile(fun = "max") +
labs(title = "Heatmap of Physical Activity frequency with respect to Age Group and Mode of transportation", x = "Age Groups", y = "Transportation Modes Used", fill = "Values")
## Warning in geom_tile(fun = "max"): Ignoring unknown parameters: `fun`
# Display the ggplot2 heatmap
print(heatmap_ggplot)

pie_chart <- plot_ly(
data,
labels = ~Calories_consumption_monitoring,
type = "pie"
)
pie_chart <- pie_chart %>%
layout(title = "Pie Chart of Percentage of people monitoring calorie consumtion")
# Display the pie chart
pie_chart
data$BMI = data$Weight/(data$Height**2)
ggplot()+geom_point(aes(x=data$Age,y=data$BMI,color = data$NObeyesdad))

# 1. Null hypothesis (H0): Mean height of Males and Females is Equal
# Alternative hypothesis (Ha): There is a difference between mean heights of Male and Female
# Conducting a two-tailed t-test
Male <- data %>% filter(Gender == "Male") %>% pull(Height)
summary(Male)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.560 1.705 1.760 1.759 1.807 1.980
Female <- data %>% filter(Gender == "Female") %>% pull(Height)
summary(Female)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.450 1.597 1.640 1.643 1.700 1.843
result <- t.test(x=Male, y=Female, alternative = "two.sided")
result
##
## Welch Two Sample t-test
##
## data: Male and Female
## t = 36.13, df = 2102.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1091292 0.1216558
## sample estimates:
## mean of x mean of y
## 1.758690 1.643298
ggttest(result)
## Warning: `geom_vline()`: Ignoring `data` because `xintercept` was provided.

# 2. Null hypothesis (H0): Proportion of smokers is more than 0.5
# Alternative hypothesis (Ha): Proportion of smokers is less than 0.5
data$SMOKE_Encoded <- ifelse(data$SMOKE == "yes", 1, 0)
# Conducting a one-tailed proportion test
prop_test_smokers <- prop.test(sum(data$SMOKE_Encoded), length(data$SMOKE_Encoded), p = 0.5, alternative = "less")
print(prop_test_smokers)
##
## 1-sample proportions test with continuity correction
##
## data: sum(data$SMOKE_Encoded) out of length(data$SMOKE_Encoded), null probability 0.5
## X-squared = 1936.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is less than 0.5
## 95 percent confidence interval:
## 0.00000000 0.02686893
## sample estimates:
## p
## 0.0208432
# 3. Null hypothesis (H0): Family history of obesity has almost no influence on Obesity level
# Alternative hypothesis (Ha): Family history of obesity influences Obesity level
# A bit about the chi squared test - To determine whether there is a significant correlation between two categorical variables, this test for independence is used. It compares the observed frequencies to those that would be expected under the assumption of independence using a contingency table. After normalizing the squared differences between the observed and expected frequencies, the test statistic Chi-squared (X²) is computed.
data$obesity_categories <- ifelse(data$NObeyesdad %in% c("Insufficient_Weight", "Normal_Weight"), "Normal", (ifelse(data$NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"), "Obese", "Overweight")))
# Conducting a chi-squared test
contingency_table <- table(data$obesity_categories, data$family_history_with_overweight)
chi_squared_result <- chisq.test(contingency_table)
print("Chi-squared Test for Independence for Obesity category and Family history of Obesity:")
## [1] "Chi-squared Test for Independence for Obesity category and Family history of Obesity:"
print(chi_squared_result)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 570.04, df = 2, p-value < 2.2e-16
ggchisqtest(chi_squared_result)

#Statistic calculation of weight and height
correlation <- cor(data$Weight, data$Height)
print(paste("Correlation between weight and height:", correlation))
## [1] "Correlation between weight and height: 0.463136116615627"
print(summary(data$BMI))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 24.33 28.72 29.70 36.02 50.81
#4. Null hypothesis (H0): Mean of people not monitoring calorie intake have higher BMI than the population mean
# Alternative hypothesis (Ha): Mean of people monitoring calorie intake have lower BMI than the population mean
# Conducting a one-tailed two sample t-test
Monitoring_no <- data %>% filter(Calories_consumption_monitoring == "no") %>% pull(BMI)
summary(Monitoring_no)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 24.77 29.30 30.02 36.21 50.81
Monitoring_yes <- data %>% filter(Calories_consumption_monitoring == "yes") %>% pull(BMI)
summary(Monitoring_yes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.79 19.32 24.19 22.94 25.39 36.16
result_monitoring <- t.test(x=Monitoring_no,y=Monitoring_yes, alternative = "less",mu=mean(data$BMI))
result_monitoring
##
## Welch Two Sample t-test
##
## data: Monitoring_no and Monitoring_yes
## t = -50.327, df = 133.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 29.70016
## 95 percent confidence interval:
## -Inf 7.828825
## sample estimates:
## mean of x mean of y
## 30.02233 22.93782
ggttest(result_monitoring)
## Warning: `geom_vline()`: Ignoring `data` because `xintercept` was provided.

#5. Null hypothesis (H0): Frequency of alcohol consumption has almost no influence on Obesity level
# Alternative hypothesis (Ha): Frequency of alcohol consumption influences Obesity level
# Conducting a chi-squared test
contingency_table <- table(data$obesity_categories, data$Consumption_of_alcohol)
chi_squared_result <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
print("Chi-squared Test for Independence for Obesity category and Family history of Obesity:")
## [1] "Chi-squared Test for Independence for Obesity category and Family history of Obesity:"
print(chi_squared_result)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 70.548, df = 6, p-value = 3.157e-13
ggchisqtest(chi_squared_result)

# Classifying task
# Split the data set into training and testing sets
set.seed(456)
train_indices <- sample(1:nrow(data), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]
# Train the Decision tree classifier
library(rpart)
print(names(data))
## [1] "Gender"
## [2] "Age"
## [3] "Height"
## [4] "Weight"
## [5] "family_history_with_overweight"
## [6] "Frequent_consumption_of_high_caloric_food"
## [7] "Frequency_of_consumption_of_vegetables"
## [8] "Number_of_main_meals"
## [9] "Consumption_of_food_between_meals"
## [10] "SMOKE"
## [11] "Consumption_of_water_daily"
## [12] "Calories_consumption_monitoring"
## [13] "Physical_activity_frequency"
## [14] "Time_using_technology_devices"
## [15] "Consumption_of_alcohol"
## [16] "Transportation_used"
## [17] "NObeyesdad"
## [18] "obesity_categories"
## [19] "AgeGroup"
## [20] "BMI"
## [21] "SMOKE_Encoded"
tree_model <- rpart(NObeyesdad ~ Gender + Age + Height + Weight+ family_history_with_overweight + Frequent_consumption_of_high_caloric_food + Frequency_of_consumption_of_vegetables + Consumption_of_water_daily+ Number_of_main_meals + Consumption_of_food_between_meals + SMOKE+ Calories_consumption_monitoring +Physical_activity_frequency+ Time_using_technology_devices + Consumption_of_alcohol +Transportation_used, data = train_data)
predictions <- predict(tree_model, test_data, type = "class")
# Calculate accuracy
correct_predictions <- sum(predictions == test_data$NObeyesdad)
total_samples <- length(predictions)
accuracy <- correct_predictions / total_samples
# Print accuracy
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 86.44 %"